Reproducible Bankruptcy Classifier Analysis

Libraries

library(h2o)
library(tidyverse)
library(plotly)

# Not in container
library(recipes)
library(embed)

Data

Bankruptcy Classification Features

# Read the prepared modelling table (a `class` factor plus the numeric
# `Attr*` feature columns) from disk, then print a preview of it.
data_prepared_tbl <- "00_data/data_prepared_tbl.rds" %>%
    read_rds()
data_prepared_tbl
## # A tibble: 4,998 x 64
##    class    Attr1 Attr2  Attr3 Attr4 Attr5  Attr6   Attr7 Attr8 Attr9 Attr10
##    <fct>    <dbl> <dbl>  <dbl> <dbl> <dbl>  <dbl>   <dbl> <dbl> <dbl>  <dbl>
##  1 0      0.0882  0.555 0.0113  1.02 -66.5 0.342  0.109   0.578  1.09  0.320
##  2 0      0.130   0.221 0.578   3.61 120.  0.188  0.162   3.06   1.14  0.677
##  3 0      0.0482  0.550 0.108   1.24 -23.0 0      0.0593  0.817  1.52  0.450
##  4 0      0.0995  0.600 0.375   1.65  19.0 0.211  0.124   0.667  1.10  0.400
##  5 0      0.0785  0.205 0.104   2.79  77.8 0.365  0.0934  3.87   1.23  0.795
##  6 0      0.125   0.354 0.314   2.71  17.9 0.306  0.158   1.82   1.24  0.646
##  7 0      0.185   0.340 0.383   2.13  54.4 0.630  0.231   1.84   1.15  0.626
##  8 0      0.0905  0.314 0.425   3.21  24.9 0.0557 0.105   2.15   1.05  0.676
##  9 0     -0.00213 0.251 0.351   2.48  31.9 0.124  0.00590 2.34   1.06  0.586
## 10 0      0.136   0.296 0.477   2.61  70.9 0.414  0.169   2.34   1.16  0.692
## # … with 4,988 more rows, and 53 more variables: Attr11 <dbl>, Attr12 <dbl>,
## #   Attr13 <dbl>, Attr14 <dbl>, Attr15 <dbl>, Attr16 <dbl>, Attr17 <dbl>,
## #   Attr18 <dbl>, Attr19 <dbl>, Attr20 <dbl>, Attr21 <dbl>, Attr22 <dbl>,
## #   Attr23 <dbl>, Attr24 <dbl>, Attr25 <dbl>, Attr26 <dbl>, Attr27 <dbl>,
## #   Attr28 <dbl>, Attr29 <dbl>, Attr30 <dbl>, Attr31 <dbl>, Attr32 <dbl>,
## #   Attr33 <dbl>, Attr34 <dbl>, Attr35 <dbl>, Attr36 <dbl>, Attr38 <dbl>,
## #   Attr39 <dbl>, Attr40 <dbl>, Attr41 <dbl>, Attr42 <dbl>, Attr43 <dbl>,
## #   Attr44 <dbl>, Attr45 <dbl>, Attr46 <dbl>, Attr47 <dbl>, Attr48 <dbl>,
## #   Attr49 <dbl>, Attr50 <dbl>, Attr51 <dbl>, Attr52 <dbl>, Attr53 <dbl>,
## #   Attr54 <dbl>, Attr55 <dbl>, Attr56 <dbl>, Attr57 <dbl>, Attr58 <dbl>,
## #   Attr59 <dbl>, Attr60 <dbl>, Attr61 <dbl>, Attr62 <dbl>, Attr63 <dbl>,
## #   Attr64 <dbl>

Bankruptcy Classification Feature Descriptions

# Raw dictionary: a single column, `Attribute.Information:`, holding one
# "<id> <description>" string per feature.
data_dictionary_raw_tbl <- read_rds("00_data/data_dictionary_raw_tbl.rds")

data_dictionary_tbl <- data_dictionary_raw_tbl %>%
    # Split at the first space only: `extra = "merge"` keeps the rest of the
    # text (which contains further spaces) together in `desc`.
    separate(
        col   = `Attribute.Information:`,
        into  = c("id", "desc"),
        sep   = " ",
        extra = "merge"
    ) %>%
    # Swap the "X" in each id for "Attr" so the ids line up with the
    # `Attr*` column names of the feature table.
    mutate(
        id = str_replace(string = id, pattern = "X", replacement = "Attr")
    )

# Preview the cleaned dictionary.
data_dictionary_tbl
## # A tibble: 64 x 2
##    id     desc                                                                  
##    <chr>  <chr>                                                                 
##  1 Attr1  net profit / total assets                                             
##  2 Attr2  total liabilities / total assets                                      
##  3 Attr3  working capital / total assets                                        
##  4 Attr4  current assets / short-term liabilities                               
##  5 Attr5  [(cash + short-term securities + receivables - short-term liabilities…
##  6 Attr6  retained earnings / total assets                                      
##  7 Attr7  EBIT / total assets                                                   
##  8 Attr8  book value of equity / total liabilities                              
##  9 Attr9  sales / total assets                                                  
## 10 Attr10 equity / total assets                                                 
## # … with 54 more rows

H2O Prediction Analysis

# Connect this R session to an H2O cluster; `h2o.init()` attaches to an
# instance already running at the default address or starts a local one.
# The model load and prediction steps below require this connection.
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 days 15 minutes 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.30.0.1 
##     H2O cluster version age:    4 months and 1 day !!! 
##     H2O cluster name:           H2O_started_from_R_mdancho_hfm505 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   7.58 GB 
##     H2O cluster total cores:    12 
##     H2O cluster allowed cores:  12 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 4.0.2 (2020-06-22)

Load Production Model

# Build the path to the saved production model, anchored at the RStudio
# project root so that it does not depend on the current working directory.
path <- rprojroot::find_rstudio_root_file() %>%
    file.path("00_production_model/PROD_H2O_MODEL")

# Load the saved H2O model through the connected cluster.
h2o_model <- h2o.loadModel(path)

Make Predictions

# Score every row of the feature table with the production model:
# upload the data to the H2O cluster, predict, and pull the result back
# into R as a tibble.
predictions_tbl <- as_tibble(
    h2o.predict(h2o_model, newdata = as.h2o(data_prepared_tbl))
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Preview the predictions: the predicted class (`predict`) and the two
# class probability columns (`p0`, `p1`).
predictions_tbl
## # A tibble: 4,998 x 3
##    predict    p0      p1
##    <fct>   <dbl>   <dbl>
##  1 0       0.994 0.00631
##  2 0       0.998 0.00206
##  3 0       0.987 0.0135 
##  4 0       0.993 0.00656
##  5 0       0.997 0.00335
##  6 0       0.997 0.00314
##  7 0       0.997 0.00260
##  8 0       0.997 0.00254
##  9 0       0.997 0.00276
## 10 0       0.997 0.00300
## # … with 4,988 more rows

Bankruptcy UMAP Visualization

Apply UMAP

# Preprocessing recipe: standardise every `Attr*` feature, then project the
# features onto three UMAP components. `class` is passed as the outcome to
# the UMAP step, and the seeds are fixed for reproducibility.
recipe_spec <- recipe(class ~ ., data = data_prepared_tbl) %>%
    step_normalize(contains("Attr")) %>%
    step_umap(
        contains("Attr"),
        outcome  = vars(class),
        num_comp = 3,
        seed     = c(123, 123)
    )

# Estimate the recipe on the data and extract the processed training set
# (`class` plus the three UMAP coordinates), then preview it.
umap_data_tbl <- juice(prep(recipe_spec))
umap_data_tbl
## # A tibble: 4,998 x 4
##    class  umap_1 umap_2 umap_3
##    <fct>   <dbl>  <dbl>  <dbl>
##  1 0     -1.59    2.67  0.828 
##  2 0      1.18    0.856 1.25  
##  3 0     -1.08    2.74  0.0844
##  4 0     -1.22    0.332 1.39  
##  5 0     -0.369   2.18  0.802 
##  6 0      0.198   1.89  1.08  
##  7 0      0.765   1.36  1.37  
##  8 0      0.633   0.486 2.25  
##  9 0     -0.0246  0.287 1.43  
## 10 0      0.558   1.27  1.15  
## # … with 4,988 more rows

Plotly Visualization

# Create tooltip/Hover ----
# Combine, column-wise, the UMAP coordinates, four selected raw features
# (plus a row-number `.id`) and the model predictions. `bind_cols()` matches
# rows by position, so all three tables must share the same row order.
plot_data_tbl <- umap_data_tbl %>%
    bind_cols(
        data_prepared_tbl %>%
            rowid_to_column(var = ".id") %>%
            select(.id, Attr39, Attr56, Attr26, Attr22),
        predictions_tbl
    ) %>%
    # One multi-line hover label per point. Every feature line uses the same
    # "Attr <n> <description>: <value>" layout. The trailing `<extra></extra>`
    # empties plotly's secondary hover box, which would otherwise repeat the
    # class as a bare trace name next to the tooltip.
    mutate(tooltip = str_glue(
        "
        Company ID: {.id}
        Class: {class}
        Bankruptcy Probability: {scales::percent(p1, accuracy = 0.1)}
        Attr 39 profit on sales / sales: {Attr39}
        Attr 26 (net profit + depreciation) / total liabilities: {Attr26}
        Attr 22 profit on operating activities / total assets: {Attr22}
        Attr 56 (sales - cost of products sold) / sales: {Attr56}<extra></extra>
        "
    ))

# Plotly Visualization ----
# 3D scatter of the UMAP embedding, coloured by class, with the custom
# tooltip shown on hover.
plot_data_tbl %>%
    plot_ly(
        x = ~ umap_1, y = ~ umap_2, z = ~ umap_3,
        color = ~ class, colors = c("#BF382A", "#0C4B8E"),
        hovertemplate = ~ tooltip
    ) %>%
    add_markers(opacity = 0.5)